In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Importing Data

In [2]:
pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


In [3]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
wine_quality = fetch_ucirepo(id=186) 

### Feature Engineering

In [5]:
x = wine_quality.data.features
y = wine_quality.data.targets
x.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


### Creating Model

import statsmodels.api as sm
def backward_elemination(data,target,significance_level):
    features = data.columns.tolist()
    while(len(features)>0):
        features_with_constant = sm.add_constant(data[features])
        p_values = sm.OLS(target, features_with_constant).fit().pvalues[1:]
        max_p_values = p_values.max()
        if(max_p_values >= significance_level):
            excluded_feature = p_values.idmax()
            features.remove(excluded_feature)
        else :
            break
    return features
backward_elemination(x,y,0.005)

In [6]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()

In [7]:
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler()
x = sc.fit_transform(x)

In [8]:
from sklearn.model_selection import train_test_split
x_train , x_test, y_train , y_test = train_test_split(x,y,test_size=0.15,random_state=0)
x_train.shape

(5522, 11)

In [9]:
reg.fit(x_train,y_train)

In [10]:
#Checking the score  
print('Train Score: ', reg.score(x_train, y_train))  
print('Test Score: ', reg.score(x_test, y_test))  

Train Score:  0.2930489486628255
Test Score:  0.2861082684067934


### Testing

In [11]:
y_predict = reg.predict(x_test)
round(y_predict[0][0])

5

In [12]:
y_predict_round = []
for i in range(len(y_predict)):
    y_predict_round.append(round(y_predict[i][0]))
y_predict_round = np.array(y_predict_round)
y_predict_round

array([5, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 7, 6, 5, 5, 6, 7, 6,
       6, 6, 6, 6, 6, 6, 5, 5, 6, 6, 7, 6, 6, 6, 6, 7, 6, 6, 5, 6, 5, 6,
       6, 5, 6, 5, 5, 6, 5, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 7, 7,
       6, 5, 6, 6, 6, 5, 5, 6, 7, 5, 5, 6, 6, 6, 6, 6, 6, 6, 5, 5, 6, 5,
       5, 6, 5, 6, 6, 6, 5, 6, 7, 6, 5, 6, 6, 5, 6, 5, 6, 6, 5, 6, 6, 5,
       6, 6, 5, 7, 6, 6, 5, 5, 7, 6, 5, 7, 7, 6, 6, 5, 6, 6, 5, 6, 6, 6,
       5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 7,
       6, 6, 6, 7, 6, 7, 6, 6, 6, 7, 5, 6, 5, 5, 6, 6, 6, 5, 6, 7, 6, 6,
       5, 5, 7, 6, 5, 6, 6, 7, 6, 6, 5, 6, 6, 7, 6, 6, 6, 5, 6, 6, 6, 6,
       6, 6, 5, 6, 6, 6, 5, 6, 5, 5, 6, 5, 6, 6, 6, 6, 5, 5, 5, 6, 6, 6,
       6, 6, 5, 6, 5, 6, 6, 6, 5, 6, 6, 6, 5, 6, 6, 5, 6, 6, 5, 6, 6, 5,
       6, 6, 7, 6, 6, 6, 5, 7, 6, 6, 5, 6, 6, 5, 6, 6, 7, 7, 6, 5, 6, 6,
       5, 5, 6, 6, 6, 6, 6, 7, 6, 6, 7, 6, 6, 6, 7, 6, 6, 7, 6, 5, 7, 6,
       6, 6, 5, 6, 6, 6, 5, 6, 6, 5, 7, 6, 7, 7, 6,

In [13]:
y_test.shape

(975, 1)

### Evaluate Accuracy

In [16]:
from sklearn.metrics import accuracy_score,mean_absolute_error,r2_score,confusion_matrix
mean_absolute_error(y_test,y_predict_round)

0.5466666666666666

In [17]:
accuracy_score(y_predict_round,y_test)

0.5087179487179487

In [18]:
confusion_matrix(y_predict_round,y_test)

array([[  0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   1,   0,   0,   0,   0],
       [  3,  18, 135,  77,   8,   0,   0],
       [  2,  16, 166, 323, 114,  21,   0],
       [  0,   0,   1,  41,  38,  10,   1],
       [  0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0]], dtype=int64)