In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

%matplotlib inline

### Let's make a multivariate linear regression

The following dataset is also common, referred to as the Wine Quality Dataset. A local copy of this dataset is provided in Moodle.
https://archive.ics.uci.edu/ml/datasets/wine+quality

In [2]:
wine = pd.read_csv('winequality-white.csv')
wine.tail()

Unnamed: 0,FAcidity,VAcidity,Citric,Sugar,Chlorides,SugarDioxide,SulfurDioxide,Density,pH,Sulphates,Alcohol,Quality
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.5,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.9949,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.3,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7
4897,6.0,0.21,0.38,0.8,0.02,22.0,98.0,0.98941,3.26,0.32,11.8,6


In [3]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression(fit_intercept=True)

In [4]:
# Extracting data
X = wine.drop(['Quality'], axis=1)
y = wine['Quality']

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Making a split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [7]:
# training the model
lm.fit(X_train, y_train)
print (lm.coef_, lm.intercept_);

[ 1.11843762e-01 -1.82456260e+00 -1.09491232e-02  1.04383826e-01
  2.16467479e-01  3.32388078e-03  7.98537570e-05 -2.13812331e+02
  9.14831857e-01  6.61417218e-01  1.22981002e-01] 212.8055483320244


In [8]:
# Let's make predictions
y_pred = lm.predict(X_test)

In [9]:
print ('Score of model in training group: {0:2.2f}'.format(lm.score(X_train, y_train)))
print ('Score of model in test group: {0:2.2f}'.format(lm.score(X_test, y_test)))

Score of model in training group: 0.29
Score of model in test group: 0.26


In [10]:
print (y_test[0:5])

105     6
1158    6
1690    4
766     5
2568    6
Name: Quality, dtype: int64


In [11]:
print (y_pred[0:5])

[6.05115789 5.52327682 4.79996594 5.02396403 5.53980039]


In [12]:
print (np.sum(y_test == np.round(y_pred.astype(np.int))), y_pred.size)
# Comment what is being done here

536 1225


In [13]:
mse = np.square(np.subtract(y_test.values, y_pred)).mean()
print ('Mean square error is: {0:2.2f}'.format(mse))

Mean square error is: 0.59


In [14]:
from sklearn.model_selection import cross_val_score

In [15]:
scores = cross_val_score(lm, X, y, scoring = 'neg_mean_squared_error', cv=5)
#scores = cross_val_score(lm, X, y, cv=10)
print ('Mean square error is (neg): {0:2.2f}'.format(scores.mean()))

Mean square error is (neg): -0.58


### Using ridge regression

In [16]:
from sklearn.linear_model import Ridge

In [34]:
rlm = Ridge(alpha=1.0, fit_intercept=True)

In [35]:
rlm.fit(X_train, y_train)
print (rlm.coef_, rlm.intercept_);

[-4.92546149e-02 -1.86287461e+00 -5.34697664e-02  2.82301633e-02
 -3.94642510e-01  4.81723595e-03 -8.73415202e-04 -1.73940251e-01
  2.11328537e-01  3.64663010e-01  3.70799981e-01] 1.950126882526066


In [36]:
print ('Score of Ridge model in training group: {0:2.2f}'.format(rlm.score(X_train, y_train)))
print ('Score of Ridge model in test group: {0:2.2f}'.format(rlm.score(X_test, y_test)))

Score of Ridge model in training group: 0.27
Score of Ridge model in test group: 0.27


In [37]:
# Let's make predictions
y_rpred = rlm.predict(X_test)
rmse = np.square(np.subtract(y_test.values, y_rpred)).mean()
print ('Mean square error with Ridge is: {0:2.2f}'.format(rmse))

Mean square error with Ridge is: 0.59


In [38]:
rscores = cross_val_score(rlm, X, y, scoring = 'neg_mean_squared_error', cv=5)
print ('CV-Mean square Ridge error is (neg): {0:2.2f}'.format(rscores.mean()))

CV-Mean square Ridge error is (neg): -0.59


### Using Lasso

In [39]:
from sklearn.linear_model import Lasso

In [40]:
llm = Lasso(alpha=0.01, fit_intercept=True, max_iter=10000)
llm.fit(X_train, y_train)
print (llm.coef_, llm.intercept_);
print ('Lasso coefficients different from zero: {}'.format(np.sum(llm.coef_ != 0)))

[-0.04832289 -0.83951702  0.          0.02327581 -0.          0.00629506
 -0.00134174 -0.          0.          0.          0.35322401] 2.5323865595117967
Lasso coefficients different from zero: 6


In [41]:
print ('Score of Lasso model in training group: {0:2.2f}'.format(llm.score(X_train, y_train)))
print ('Score of Lasso model in test group: {0:2.2f}'.format(llm.score(X_test, y_test)))

Score of Lasso model in training group: 0.25
Score of Lasso model in test group: 0.24


In [42]:
# Let's make predictions
y_lpred = llm.predict(X_test)
lmse = np.square(np.subtract(y_test.values, y_lpred)).mean()
print ('Mean square error with Lasso is: {0:2.2f}'.format(lmse))

Mean square error with Lasso is: 0.61


In [43]:
lscores = cross_val_score(llm, X, y, scoring = 'neg_mean_squared_error', cv=10)
print ('CV-Mean square Lasso error is (neg): {0:2.2f}'.format(lscores.mean()))

CV-Mean square Lasso error is (neg): -0.59


### Polynomial features

In [44]:
from sklearn.preprocessing import PolynomialFeatures

In [45]:
poly = PolynomialFeatures(3)

In [46]:
Xp_train = poly.fit_transform(X_train)
Xp_test = poly.fit_transform(X_test)

In [47]:
lm.fit(Xp_train, y_train)
# Let's make a prediction
yp_predict = lm.predict(Xp_test)

In [48]:
pmse = np.square(np.subtract(y_test.values, yp_predict)).mean()
print ('Mean square error with Polynomial is: {0:2.2f}'.format(lmse))

Mean square error with Polynomial is: 0.61


### Using Stochastic Gradient Descend 

In [49]:
from sklearn.linear_model import SGDRegressor

In [50]:
gdlm = SGDRegressor(fit_intercept = True, eta0=0.01, learning_rate = 'constant', max_iter = 1000)

#### Let's scale the columns between 0 and 1

In [51]:
## Let's scale the columns between 0 and 1
for i in range(12):
    cmin = wine.iloc[:,i].min()
    cmax = wine.iloc[:,i].max()
    wine.iloc[:,i] = (wine.iloc[:,i]-cmin)/(cmax-cmin)
    
wine.head()

Unnamed: 0,FAcidity,VAcidity,Citric,Sugar,Chlorides,SugarDioxide,SulfurDioxide,Density,pH,Sulphates,Alcohol,Quality
0,0.307692,0.186275,0.216867,0.308282,0.106825,0.149826,0.37355,0.267785,0.254545,0.267442,0.129032,0.5
1,0.240385,0.215686,0.204819,0.015337,0.118694,0.041812,0.285383,0.132832,0.527273,0.313953,0.241935,0.5
2,0.413462,0.196078,0.240964,0.096626,0.121662,0.097561,0.204176,0.154039,0.490909,0.255814,0.33871,0.5
3,0.326923,0.147059,0.192771,0.121166,0.145401,0.156794,0.410673,0.163678,0.427273,0.209302,0.306452,0.5
4,0.326923,0.147059,0.192771,0.121166,0.145401,0.156794,0.410673,0.163678,0.427273,0.209302,0.306452,0.5


In [52]:
# Extracting data
X = wine.drop(['Quality'], axis=1)
y = wine['Quality']

In [53]:
#scores = cross_val_score(gdlm, X, y, cv=10)
scores = cross_val_score(gdlm, X, y, scoring='neg_mean_squared_error', cv=5)
print(scores.mean())

-0.01616297109283539


In [54]:
# Making a split
Xs_train = X.iloc[np.array(X_train.index), :]
Xs_test = X.iloc[np.array(X_test.index), :]
ys_train = y.iloc[np.array(y_train.index)]
ys_test = y.iloc[np.array(y_test.index)]

In [55]:
gdlm.fit(Xs_train, ys_train);
ys_pred = gdlm.predict(Xs_test)

In [56]:
sgmse = np.square(np.subtract(ys_test.values, ys_pred)).mean()
print ('Mean square error with Sotchastic Gradient is: {0:2.3f}'.format(sgmse))

Mean square error with Sotchastic Gradient is: 0.016


In [57]:
print (ys_test[0:5])

105     0.500000
1158    0.500000
1690    0.166667
766     0.333333
2568    0.500000
Name: Quality, dtype: float64


In [58]:
print (ys_pred[0:5])

[0.49087844 0.42504224 0.31374166 0.36475132 0.43798399]


In [59]:
gdlm.coef_

array([ 0.01677361, -0.31537438, -0.00874946,  0.60527135, -0.02186955,
        0.19456977, -0.02915065, -0.70319573,  0.08833637,  0.07060021,
        0.28605032])

Is this misleading?