# Logistic Regression

## Part 1 - Data Preprocessing

### Importing the dataset

In [10]:
import pandas as pd
import numpy as np
dataset = pd.read_csv('winequality-red.csv') 

In [11]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [12]:
dataset.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5


### Data Cleaning

In [13]:
dataset = dataset.drop_duplicates()
dataset

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
5,7.4,0.660,0.00,1.8,0.075,13.0,40.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1593,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5,6
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [17]:
dataset['quality'] = np.where(dataset['quality'] <= 5, 2, dataset['quality'])
dataset['quality'] = np.where(dataset['quality'] > 5, 4, dataset['quality'])

dataset

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,2
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,2
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,2
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,4
5,7.4,0.660,0.00,1.8,0.075,13.0,40.0,0.99780,3.51,0.56,9.4,2
...,...,...,...,...,...,...,...,...,...,...,...,...
1593,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5,4
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,2
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,4
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,2


### Getting the inputs and output

In [18]:
X = dataset.iloc[:,1:-1].values
y = dataset.iloc[:,-1].values

In [19]:
X

array([[ 0.7  ,  0.   ,  1.9  , ...,  3.51 ,  0.56 ,  9.4  ],
       [ 0.88 ,  0.   ,  2.6  , ...,  3.2  ,  0.68 ,  9.8  ],
       [ 0.76 ,  0.04 ,  2.3  , ...,  3.26 ,  0.65 ,  9.8  ],
       ...,
       [ 0.55 ,  0.1  ,  2.2  , ...,  3.52 ,  0.76 , 11.2  ],
       [ 0.645,  0.12 ,  2.   , ...,  3.57 ,  0.71 , 10.2  ],
       [ 0.31 ,  0.47 ,  3.6  , ...,  3.39 ,  0.66 , 11.   ]])

In [20]:
y

array([2, 2, 2, ..., 4, 2, 4])

### Creating the Training Set and the Test Set

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

In [22]:
X_train

array([[ 0.6 ,  0.25,  2.2 , ...,  3.15,  0.53,  9.8 ],
       [ 0.41,  0.41,  6.2 , ...,  3.24,  0.77, 12.6 ],
       [ 0.34,  0.42,  1.8 , ...,  3.18,  0.55, 11.4 ],
       ...,
       [ 0.54,  0.37, 15.4 , ...,  3.18,  0.77,  9.  ],
       [ 0.67,  0.05,  3.6 , ...,  3.4 ,  0.63, 10.1 ],
       [ 0.61,  0.02,  2.5 , ...,  3.48,  0.6 ,  9.7 ]])

In [23]:
X_test

array([[ 0.48,  0.07,  5.5 , ...,  3.37,  0.68, 11.2 ],
       [ 0.58,  0.25,  2.8 , ...,  3.23,  0.57,  9.7 ],
       [ 0.36,  0.31,  1.7 , ...,  3.46,  0.62,  9.5 ],
       ...,
       [ 0.98,  0.32,  2.3 , ...,  3.25,  0.48,  9.4 ],
       [ 0.57,  0.27,  2.3 , ...,  3.27,  0.55,  9.4 ],
       [ 0.46,  0.  ,  1.9 , ...,  3.5 ,  0.53, 11.2 ]])

In [24]:
y_train

array([2, 4, 2, ..., 4, 2, 4])

In [25]:
y_test

array([4, 2, 4, 4, 4, 4, 2, 4, 2, 4, 2, 2, 2, 2, 4, 2, 4, 4, 2, 4, 4, 4,
       2, 4, 4, 2, 4, 4, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 4, 2, 2, 4, 2, 4,
       4, 2, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 2, 2, 2, 4, 2, 4, 2, 2, 4, 2,
       2, 2, 2, 2, 4, 4, 2, 2, 4, 2, 4, 2, 4, 4, 2, 2, 4, 4, 2, 4, 2, 2,
       2, 4, 4, 4, 2, 4, 2, 4, 4, 2, 4, 4, 4, 4, 2, 4, 2, 2, 2, 2, 4, 2,
       4, 2, 2, 4, 2, 2, 4, 4, 2, 2, 4, 4, 4, 4, 4, 2, 4, 2, 2, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 2, 2, 2, 4, 2, 4, 4, 4, 2, 2, 2,
       2, 2, 4, 2, 4, 4, 2, 4, 2, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 2, 2, 2,
       4, 4, 2, 4, 4, 2, 4, 2, 4, 4, 2, 2, 4, 4, 4, 4, 4, 2, 2, 4, 4, 4,
       4, 2, 2, 2, 2, 4, 2, 2, 2, 4, 4, 4, 2, 4, 4, 4, 2, 2, 2, 4, 4, 4,
       4, 4, 2, 2, 4, 2, 2, 4, 4, 2, 4, 4, 4, 4, 4, 2, 4, 2, 4, 4, 2, 4,
       4, 2, 4, 4, 4, 2, 4, 4, 2, 4, 4, 2, 2, 4, 2, 4, 4, 2, 4, 4, 2, 4,
       4, 2, 4, 4, 2, 2, 2, 2])

### Feature Scaling

In [26]:

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)

In [27]:
X_train

array([[ 3.63998748e-01, -9.32959162e-02, -2.33081600e-01, ...,
        -1.05347299e+00, -7.57431334e-01, -5.81274566e-01],
       [-6.73135300e-01,  7.23313331e-01,  2.95784660e+00, ...,
        -4.63067040e-01,  7.21047427e-01,  2.02959284e+00],
       [-1.05523732e+00,  7.74351409e-01, -5.52174420e-01, ...,
        -8.56671006e-01, -6.34224770e-01,  9.10649666e-01],
       ...,
       [ 3.64827331e-02,  5.19161019e-01,  1.02969815e+01, ...,
        -8.56671006e-01,  7.21047427e-01, -1.32723668e+00],
       [ 7.46100766e-01, -1.11405748e+00,  8.83743270e-01, ...,
         5.86543537e-01, -1.41398517e-01, -3.01538772e-01],
       [ 4.18584751e-01, -1.26717171e+00,  6.23801511e-03, ...,
         1.11134883e+00, -3.26208362e-01, -6.74519831e-01]])

## Part 2 - Building and training the model

### Building the model

In [28]:
from sklearn.linear_model import LogisticRegression
model= LogisticRegression(random_state=0)

### Training the model

In [29]:
model.fit(X_train, y_train)

In [30]:
y_pred = model.predict(sc.transform(X_test))

In [31]:
y_pred

array([4, 2, 2, 2, 4, 4, 4, 4, 2, 4, 2, 4, 4, 2, 4, 4, 4, 4, 2, 4, 4, 2,
       4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 2, 2, 4, 4, 4, 2, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4, 2, 2, 2,
       2, 4, 2, 2, 2, 4, 2, 2, 4, 4, 4, 2, 4, 2, 2, 2, 4, 4, 4, 4, 2, 4,
       4, 4, 4, 4, 2, 4, 2, 4, 4, 2, 2, 4, 4, 4, 4, 4, 4, 2, 2, 2, 4, 2,
       2, 2, 2, 4, 2, 4, 4, 4, 2, 4, 2, 2, 2, 4, 4, 2, 4, 4, 2, 2, 4, 4,
       4, 2, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 4, 2, 2, 4, 2, 2, 2, 4,
       2, 4, 4, 2, 4, 4, 2, 4, 2, 2, 4, 2, 2, 4, 4, 4, 2, 2, 2, 4, 2, 2,
       4, 2, 2, 2, 2, 2, 4, 2, 2, 4, 2, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2,
       4, 4, 2, 2, 4, 2, 2, 2, 2, 4, 4, 4, 2, 4, 2, 2, 4, 2, 4, 4, 4, 4,
       2, 4, 2, 2, 4, 4, 2, 2, 2, 2, 4, 4, 2, 4, 4, 2, 4, 2, 2, 2, 2, 4,
       4, 2, 4, 4, 4, 2, 4, 2, 2, 2, 4, 2, 4, 4, 2, 4, 4, 4, 4, 4, 2, 4,
       4, 2, 4, 2, 2, 2, 2, 4])

## Part 3: Evaluating the model 

### Confusion Matrix

In [39]:
from sklearn.metrics import confusion_matrix

confusion = confusion_matrix(y_test, y_pred)
confusion

array([[ 85,  39],
       [ 38, 110]])

### Accuracy

In [43]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.7169117647058824