In [153]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

## `Task` Select best features from from Wine Quality dataset.

* Data Link : https://www.kaggle.com/datasets/rajyellow46/wine-quality

* Drive link : https://docs.google.com/spreadsheets/d/e/2PACX-1vQDVwxneOKOaJL13QMhkAhYrgWlH1tICY7RacUnj_lL8m9uUWaaUf3p7bScNyh_D2Rvt7nc1q11adSy/pub?gid=647503637&single=true&output=csv

Note : Follow approach from Feature Selection Session - 2

In [133]:
df  = pd.read_csv('https://www.google.com/url?q=https%3A%2F%2Fdocs.google.com%2Fspreadsheets%2Fd%2Fe%2F2PACX-1vQDVwxneOKOaJL13QMhkAhYrgWlH1tICY7RacUnj_lL8m9uUWaaUf3p7bScNyh_D2Rvt7nc1q11adSy%2Fpub%3Fgid%3D647503637%26single%3Dtrue%26output%3Dcsv')

In [134]:
df.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [135]:
df.dropna(inplace=True)

In [136]:
df['quality'].unique()

array([6, 5, 7, 8, 4, 3, 9])

In [137]:
X = df.iloc[:,1:-1]
y = df.iloc[:,-1]

In [138]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=20)

# Scaling

In [154]:
sc =StandardScaler()

In [None]:
X_train  = sc.fit(X_train)
X_test  = sc.fit(X_test)

# Exhaustive Feature Selection

In [139]:
lr = LinearRegression()

In [140]:
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)

In [141]:
r2_score(y_pred,y_test)

-1.2866139798995935

In [142]:
efs = EFS(lr, max_features=X_train.shape[1], scoring='r2', cv=5)

In [143]:
efs.fit_transform(X_train,y_train)

Features: 2047/2047

array([[ 8.2 ,  0.22,  0.49, ...,  3.02,  0.33, 10.6 ],
       [ 8.8 ,  0.52,  0.34, ...,  3.26,  0.61,  9.5 ],
       [ 6.7 ,  0.25,  0.36, ...,  3.18,  0.5 ,  9.6 ],
       ...,
       [ 7.8 ,  0.43,  0.49, ...,  3.14,  0.35, 11.3 ],
       [ 6.4 ,  0.18,  0.28, ...,  3.25,  0.35, 10.5 ],
       [ 6.7 ,  0.5 ,  0.38, ...,  3.32,  0.54,  9.6 ]])

In [144]:
X_train = efs.transform(X_train)
X_test = efs.transform(X_test)


In [145]:
np.mean(cross_val_score(lr,X_test,y_test,cv=5,scoring='r2'))

np.float64(0.278130257094605)

# Sequential Backword Selection

In [148]:
sfs = SFS(lr,k_features='best',cv=5,scoring='r2',forward=True)

In [150]:
X_train = sfs.fit_transform(X_train,y_train)


In [151]:
X_test = sfs.transform(X_test)

In [152]:
np.mean(cross_val_score(lr,X_test,y_test,cv=5,scoring='r2'))

np.float64(0.278130257094605)

# Sequential forward Selection

In [None]:
sfs = SFS(lr,k_features='best',cv=5,scoring='r2',forward=True)
X_train = sfs.fit_transform(X_train,y_train)
X_test = sfs.transform(X_test)
np.mean(cross_val_score(lr,X_test,y_test,cv=5,scoring='r2'))