# Logistic Regression

## Part 1 - Data Preprocessing

### Importing the dataset

In [412]:
import pandas as pd
import numpy as np

dataset = pd.read_csv('winequality-red.csv') 

In [413]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [414]:
dataset.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5


In [415]:
# Define categorical functions for each feature
def acidity_category(value):
    return '1' if value > 10 else '0'

def sugar_category(value):
    return '1' if value > 3 else '0'

def chlorides_category(value):
    return '1' if value > 0.08 else '0'

def sulfur_dioxide_category(value):
    return '1' if value > 50 else '0'

def alcohol_category(value):
    return '1' if value > 10 else '0'

In [416]:
# Apply the functions to create new categorical columns
dataset['acidity_level'] = dataset['fixed acidity'].apply(acidity_category)
dataset['sugar_level'] = dataset['residual sugar'].apply(sugar_category)
dataset['chlorides_level'] = dataset['chlorides'].apply(chlorides_category)
dataset['sulfur_dioxide_level'] = dataset['total sulfur dioxide'].apply(sulfur_dioxide_category)
dataset['alcohol_level'] = dataset['alcohol'].apply(alcohol_category)

In [417]:
# Display the first few rows of the updated dataset
print(dataset.head())

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality acidity_level sugar_level chlorides_level  \
0      9.4        5  

In [418]:
dataset['quality'] = np.where(dataset['quality'] <= 5, 2, dataset['quality'])
dataset['quality'] = np.where(dataset['quality'] > 5, 4, dataset['quality'])
dataset

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,acidity_level,sugar_level,chlorides_level,sulfur_dioxide_level,alcohol_level
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,2,0,0,0,0,0
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,2,0,0,1,1,0
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,2,0,0,1,1,0
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,4,1,0,0,1,0
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,2,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,2,0,0,1,0,1
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,4,0,0,0,1,1
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,4,0,0,0,0,1
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,2,0,0,0,0,1


### Getting the inputs and output

In [419]:
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values

In [420]:
X

array([[7.4, 0.7, 0.0, ..., '0', '0', '0'],
       [7.8, 0.88, 0.0, ..., '0', '1', '1'],
       [7.8, 0.76, 0.04, ..., '0', '1', '1'],
       ...,
       [6.3, 0.51, 0.13, ..., '0', '0', '0'],
       [5.9, 0.645, 0.12, ..., '0', '0', '0'],
       [6.0, 0.31, 0.47, ..., '1', '0', '0']], dtype=object)

In [421]:
y

array(['0', '0', '0', ..., '1', '1', '1'], dtype=object)

### Creating the Training Set and the Test Set

In [422]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

In [423]:
X_train

array([[9.9, 0.54, 0.45, ..., '0', '0', '0'],
       [10.8, 0.26, 0.45, ..., '1', '0', '0'],
       [9.9, 0.35, 0.55, ..., '0', '0', '0'],
       ...,
       [7.9, 0.57, 0.31, ..., '0', '0', '1'],
       [13.0, 0.47, 0.49, ..., '1', '1', '0'],
       [9.8, 0.98, 0.32, ..., '0', '0', '1']], dtype=object)

In [424]:
X_test

array([[10.8, 0.47, 0.43, ..., '0', '1', '1'],
       [8.1, 0.82, 0.0, ..., '1', '1', '0'],
       [9.1, 0.29, 0.33, ..., '0', '0', '0'],
       ...,
       [9.1, 0.34, 0.42, ..., '0', '0', '0'],
       [9.1, 0.765, 0.04, ..., '0', '0', '0'],
       [8.2, 0.32, 0.42, ..., '0', '1', '0']], dtype=object)

In [425]:
y_train

array(['0', '0', '1', ..., '0', '1', '0'], dtype=object)

In [426]:
y_test

array(['1', '0', '1', '0', '1', '0', '0', '1', '0', '0', '0', '1', '1',
       '1', '1', '1', '1', '1', '1', '0', '1', '0', '1', '1', '0', '0',
       '0', '1', '0', '1', '1', '1', '1', '0', '1', '1', '0', '0', '1',
       '1', '0', '1', '1', '1', '1', '0', '0', '1', '1', '1', '0', '0',
       '1', '1', '0', '1', '0', '0', '1', '0', '0', '1', '1', '1', '0',
       '1', '1', '1', '1', '1', '1', '0', '0', '1', '1', '1', '0', '0',
       '1', '1', '1', '0', '1', '1', '1', '1', '1', '0', '0', '0', '0',
       '0', '1', '0', '1', '1', '1', '0', '0', '0', '1', '1', '1', '1',
       '1', '0', '1', '0', '1', '0', '1', '0', '1', '0', '1', '0', '0',
       '1', '1', '1', '0', '1', '0', '0', '1', '1', '0', '0', '1', '1',
       '0', '0', '1', '1', '1', '0', '0', '0', '0', '0', '1', '0', '0',
       '0', '0', '0', '1', '1', '1', '0', '1', '1', '0', '1', '0', '1',
       '0', '0', '1', '1', '1', '1', '1', '0', '1', '0', '1', '1', '1',
       '1', '1', '0', '0', '1', '1', '1', '1', '1', '1', '0', '0

### Feature Scaling

In [427]:

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)

In [428]:
X_train

array([[ 0.90103398,  0.05480282,  0.9094138 , ..., -0.39813879,
        -0.88815043, -0.72794526],
       [ 1.41998736, -1.47967601,  0.9094138 , ...,  2.51168697,
        -0.88815043, -0.72794526],
       [ 0.90103398, -0.98645067,  1.4208416 , ..., -0.39813879,
        -0.88815043, -0.72794526],
       ...,
       [-0.25219574,  0.21921126,  0.19341488, ..., -0.39813879,
        -0.88815043,  1.37372966],
       [ 2.68854005, -0.32881689,  1.11398492, ...,  2.51168697,
         1.12593539, -0.72794526],
       [ 0.84337249,  2.46612668,  0.24455766, ..., -0.39813879,
        -0.88815043,  1.37372966]])

## Part 2 - Building and training the model

### Building the model

In [429]:
from sklearn.linear_model import LogisticRegression
model= LogisticRegression(random_state=0)

### Training the model

In [430]:
model.fit(X_train, y_train)

In [431]:
y_pred = model.predict(sc.transform(X_test))

In [432]:
y_pred

array(['1', '0', '1', '0', '1', '0', '0', '1', '0', '0', '0', '1', '1',
       '1', '1', '1', '1', '1', '1', '0', '1', '0', '1', '1', '0', '0',
       '0', '1', '0', '1', '1', '1', '1', '0', '1', '1', '0', '0', '1',
       '1', '0', '1', '1', '1', '1', '0', '0', '1', '1', '1', '0', '0',
       '1', '1', '0', '1', '0', '0', '1', '0', '0', '1', '1', '1', '0',
       '1', '0', '1', '1', '1', '1', '0', '0', '1', '1', '1', '0', '0',
       '1', '1', '1', '0', '1', '1', '1', '1', '1', '0', '0', '0', '0',
       '0', '1', '0', '1', '1', '1', '0', '0', '0', '1', '1', '1', '1',
       '1', '0', '1', '0', '1', '0', '1', '0', '1', '0', '1', '0', '0',
       '1', '1', '1', '0', '1', '0', '0', '1', '1', '0', '0', '1', '1',
       '0', '0', '1', '1', '1', '0', '0', '0', '0', '0', '1', '0', '0',
       '0', '0', '0', '1', '1', '1', '0', '1', '1', '0', '1', '0', '1',
       '0', '0', '1', '1', '1', '1', '1', '0', '1', '0', '1', '1', '1',
       '1', '1', '0', '0', '1', '1', '1', '1', '1', '1', '0', '0

### INFERENCE

In [433]:
y_pred = model.predict(sc.transform(X_test))

In [434]:
y_pred

array(['1', '0', '1', '0', '1', '0', '0', '1', '0', '0', '0', '1', '1',
       '1', '1', '1', '1', '1', '1', '0', '1', '0', '1', '1', '0', '0',
       '0', '1', '0', '1', '1', '1', '1', '0', '1', '1', '0', '0', '1',
       '1', '0', '1', '1', '1', '1', '0', '0', '1', '1', '1', '0', '0',
       '1', '1', '0', '1', '0', '0', '1', '0', '0', '1', '1', '1', '0',
       '1', '0', '1', '1', '1', '1', '0', '0', '1', '1', '1', '0', '0',
       '1', '1', '1', '0', '1', '1', '1', '1', '1', '0', '0', '0', '0',
       '0', '1', '0', '1', '1', '1', '0', '0', '0', '1', '1', '1', '1',
       '1', '0', '1', '0', '1', '0', '1', '0', '1', '0', '1', '0', '0',
       '1', '1', '1', '0', '1', '0', '0', '1', '1', '0', '0', '1', '1',
       '0', '0', '1', '1', '1', '0', '0', '0', '0', '0', '1', '0', '0',
       '0', '0', '0', '1', '1', '1', '0', '1', '1', '0', '1', '0', '1',
       '0', '0', '1', '1', '1', '1', '1', '0', '1', '0', '1', '1', '1',
       '1', '1', '0', '0', '1', '1', '1', '1', '1', '1', '0', '0

In [435]:
y_test

array(['1', '0', '1', '0', '1', '0', '0', '1', '0', '0', '0', '1', '1',
       '1', '1', '1', '1', '1', '1', '0', '1', '0', '1', '1', '0', '0',
       '0', '1', '0', '1', '1', '1', '1', '0', '1', '1', '0', '0', '1',
       '1', '0', '1', '1', '1', '1', '0', '0', '1', '1', '1', '0', '0',
       '1', '1', '0', '1', '0', '0', '1', '0', '0', '1', '1', '1', '0',
       '1', '1', '1', '1', '1', '1', '0', '0', '1', '1', '1', '0', '0',
       '1', '1', '1', '0', '1', '1', '1', '1', '1', '0', '0', '0', '0',
       '0', '1', '0', '1', '1', '1', '0', '0', '0', '1', '1', '1', '1',
       '1', '0', '1', '0', '1', '0', '1', '0', '1', '0', '1', '0', '0',
       '1', '1', '1', '0', '1', '0', '0', '1', '1', '0', '0', '1', '1',
       '0', '0', '1', '1', '1', '0', '0', '0', '0', '0', '1', '0', '0',
       '0', '0', '0', '1', '1', '1', '0', '1', '1', '0', '1', '0', '1',
       '0', '0', '1', '1', '1', '1', '1', '0', '1', '0', '1', '1', '1',
       '1', '1', '0', '0', '1', '1', '1', '1', '1', '1', '0', '0

Making the prediction of a single data point with:

1. Fixed acidity = 4.5
2. Volatile acidity = 1
3. Citric Acid = 2
4. Residual sugar = 3
5. Chlorides = 4
6. Free sulfur = 5
7. Total sulfur = 6
8. Density = 7
9. pH = 8
10. Sulphates = 9
11. Alcohol = 10

In [436]:
model.predict(sc.transform([[7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0,0,0,0,0]]))

array(['0'], dtype=object)

## Part 3: Evaluating the model 

### Confusion Matrix

In [437]:
from sklearn.metrics import confusion_matrix

confusion = confusion_matrix(y_test, y_pred)
confusion

array([[146,   1],
       [  2, 171]])

### Accuracy

In [438]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.990625