# Feature selection

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the UCI Wine data set. The raw file carries no header row, so the
# column names are assigned right after reading (column 0 is the class label).
WINE_DATA_URL = ('https://archive.ics.uci.edu/'
                 'ml/machine-learning-databases/wine/wine.data')
df = pd.read_csv(WINE_DATA_URL, header=None)

df.columns = [
    'Class label',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

# Partition the data: features are every column after the first,
# the target is the first column.
X = df.iloc[:, 1:].values
y = df.iloc[:, 0].values

# Hold out 30% of the rows as the test set (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

# Min-max scaling: the scaler is fitted on the training split only and the
# same fitted scaler is then applied to both splits.
mms = MinMaxScaler().fit(X_train)
X_train_norm = mms.transform(X_train)
X_test_norm = mms.transform(X_test)

# Standardization: same pattern, fitted on the training split only.
stdsc = StandardScaler().fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

## L1 Regularization

In [None]:
from sklearn.linear_model import LogisticRegression

# L1-penalized logistic regression fitted on the standardized training data.
# NOTE(review): `multi_class` is reported as deprecated in newer scikit-learn
# releases — confirm against the installed version.
lr = LogisticRegression(
    solver='liblinear',
    penalty='l1',
    C=0.1,
    multi_class='ovr',
)
lr.fit(X_train_std, y_train)

# Report the mean accuracy on both splits.
for caption, features, labels in (
        ('Training accuracy:', X_train_std, y_train),
        ('Test accuracy:', X_test_std, y_test)):
    print(caption, lr.score(features, labels))

In [None]:
# Show the intercept term(s) of the fitted logistic regression model `lr`
print(lr.intercept_)

In [None]:
# Print one line per row of `lr.coef_`, each weight formatted to two decimals.
for model_idx, weights in enumerate(lr.coef_):
    formatted = [f"{w:5.2f}" for w in weights]
    print(f"(Model:{model_idx}) Coefs: " + ", ".join(formatted))

In [None]:
# Calculate weight values for different values of C.
# The rows are collected first and the DataFrame is built once at the end:
# growing a frame one `.loc` row at a time copies it on every step and starts
# from an empty object-dtype frame, leaving the final dtypes to inference.
c_values = []
weight_rows = []
for c in np.arange(-4., 6.):
    # C sweeps the powers of ten 10**-4 ... 10**5.
    C = 10.**c
    lr = LogisticRegression(solver='liblinear', penalty='l1', C=C,
                            random_state=0, multi_class='ovr')
    lr.fit(X_train_std, y_train)
    c_values.append(C)
    # Keep only the second row of the coefficient matrix (index 1).
    weight_rows.append(lr.coef_[1])

# One row per C value, one column per feature; the index is named 'C'.
df2 = pd.DataFrame(weight_rows,
                   index=pd.Index(c_values, name='C'),
                   columns=df.columns[1:])

display(df2)

In [None]:
# Plot the results
import seaborn as sns
# One line per feature column of df2, each with its own color and marker.
cols = df2.columns.values
n_series = len(cols)
colors = sns.color_palette(n_colors=n_series)
markers = [
    '.', ',', 'o', 'v', '^', '<', '>', '1', '2', '3', '4', '8', 's',
    'p', 'P', '*', 'h', 'H', '+', 'x', 'X', 'D', 'd', '|', '_',
][:n_series]

fig, ax = plt.subplots(figsize=(10, 5))
for col, color, marker in zip(cols, colors, markers):
    line_style = dict(label=col, color=color, marker=marker, markersize=8)
    # Plotting the Series uses its index (the C values) as the x axis.
    ax.plot(df2[col], **line_style)

# Logarithmic x axis, with the legend placed outside the axes on the right.
ax.set_xscale('log')
ax.legend(bbox_to_anchor=(1.04, 1), loc="upper left")
# Limit the visible C range.
ax.set_xlim([10.**-2, 10.**4])
ax.set(title='L1 Regularization Effect', xlabel='C', ylabel='Weight')
plt.show()

## Sequential feature selection algorithms

In [None]:
from sklearn.base import clone
from itertools import combinations
from sklearn.metrics import accuracy_score

class SBS():
    """Sequential Backward Selection (SBS) of features.

    Starting from all features, the search repeatedly evaluates every subset
    obtained by dropping one feature, keeps the best-scoring one, and stops
    once ``k_features`` features remain. Scores are computed on an internal
    hold-out split of the data passed to :meth:`fit`.

    Parameters
    ----------
    estimator : estimator object
        Cloned on construction; the clone is refitted for every candidate
        subset.
    k_features : int
        Number of features at which the search stops. Must be at least 1.
    scoring : callable, default=accuracy_score
        Called as ``scoring(y_true, y_pred)``; the largest value wins.
    test_size : default=0.25
        Forwarded to ``train_test_split`` for the internal hold-out split.
    random_state : default=1
        Forwarded to ``train_test_split``.

    Attributes
    ----------
    indices_ : tuple of int
        Column indices of the final feature subset.
    subsets_ : list of tuple of int
        Best subset found at each size, starting with the full feature set.
    scores_ : list
        Score of each entry in ``subsets_``.
    k_score_ : score
        Score of the final subset (last entry of ``scores_``).
    """

    def __init__(self, estimator, k_features, scoring=accuracy_score,
                 test_size=0.25, random_state=1):
        self.scoring = scoring
        self.estimator = clone(estimator)
        self.k_features = k_features
        self.test_size = test_size
        self.random_state = random_state

    @staticmethod
    def _as_array(X):
        """Return a DataFrame as an ndarray; pass any other input through."""
        # Positional column indexing (X[:, idx]) is not valid on a DataFrame.
        return X.to_numpy() if isinstance(X, pd.DataFrame) else X

    def fit(self, X, y):
        """Run the backward search on ``X`` / ``y`` and return ``self``.

        Raises
        ------
        ValueError
            If ``k_features`` is smaller than 1.
        """
        # Fail fast: without this check the whole search would run down to a
        # single feature and only then fail on a zero-feature fit.
        if self.k_features < 1:
            raise ValueError(
                f"k_features must be at least 1, got {self.k_features!r}")

        X = self._as_array(X)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state)

        dim = X_train.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        # Baseline: score with every feature included.
        self.scores_ = [self._calc_score(X_train, y_train,
                                         X_test, y_test, self.indices_)]

        while dim > self.k_features:
            # Every subset that drops exactly one of the remaining features.
            candidates = list(combinations(self.indices_, r=dim - 1))
            scores = [self._calc_score(X_train, y_train, X_test, y_test, p)
                      for p in candidates]

            # argmax returns the first maximum, so ties go to the earliest
            # candidate in combination order.
            best = int(np.argmax(scores))
            self.indices_ = candidates[best]
            self.subsets_.append(self.indices_)
            self.scores_.append(scores[best])
            dim -= 1

        self.k_score_ = self.scores_[-1]

        return self

    def transform(self, X):
        """Return the columns of ``X`` selected by the last :meth:`fit`."""
        return self._as_array(X)[:, list(self.indices_)]

    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        """Fit on the given columns of the training split and score the rest.

        The estimator is refitted in place on every call.
        """
        columns = list(indices)
        self.estimator.fit(X_train[:, columns], y_train)
        y_pred = self.estimator.predict(X_test[:, columns])
        return self.scoring(y_test, y_pred)

In [None]:
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=2)

# Selecting features: run the backward search all the way down to a single
# feature so that the best subset of every size is recorded.
sbs = SBS(knn, k_features=1)
sbs.fit(X_train_std, y_train)

# Print the best subset found at each size together with its score.
# The loop variable is deliberately not called `set`: that would shadow the
# builtin for every later cell of the notebook session.
for subset, score in zip(sbs.subsets_, sbs.scores_):
    subset_str = ", ".join(f"{i:1d}" for i in subset)
    print(f"Accuracy = {score:3.2f}: Features = {subset_str}")

In [None]:
# Plotting performance of feature subsets: score against subset size.
k_feat = list(map(len, sbs.subsets_))

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(k_feat, sbs.scores_, marker='o')
ax.set(ylim=[0.7, 1.1], ylabel='Accuracy', xlabel='Number of features')
ax.grid()
plt.show()

In [None]:
# Print the chosen columns for the subset containing 5 features.
# The subset is looked up by its size instead of a hard-coded list position,
# which was only correct for one particular number of input features.
n_selected = 5
subset_sizes = [len(s) for s in sbs.subsets_]
# list.index raises ValueError if no subset of that size was recorded.
k5 = list(sbs.subsets_[subset_sizes.index(n_selected)])
print(k5)
print(df.columns[1:].values[k5])

In [None]:
# Compare the accuracy: refit kNN on every standardized feature and report
# the score on both splits.
knn.fit(X_train_std, y_train)
for caption, features, labels in (
        ('Training accuracy:', X_train_std, y_train),
        ('Test accuracy:', X_test_std, y_test)):
    print(caption, knn.score(features, labels))

In [None]:
# Refit kNN using only the columns listed in k5 and report both scores.
X_train_k5 = X_train_std[:, k5]
X_test_k5 = X_test_std[:, k5]

knn.fit(X_train_k5, y_train)
print('Training accuracy:', knn.score(X_train_k5, y_train))
print('Test accuracy:', knn.score(X_test_k5, y_test))

## Feature Importances with Random Forests

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Create the random forest (500 trees, fixed seed, n_jobs=-1).
forest = RandomForestClassifier(
    n_estimators=500,
    random_state=0,
    n_jobs=-1,
)

# Training: the forest is fitted on the unscaled training features.
forest.fit(X_train, y_train)

In [None]:
# Extract the importance measure from the fitted forest
importances = forest.feature_importances_

# Feature column names (everything except the class label)
cols = df.columns[1:]

# Sort both arrays by importance, from largest to smallest
idxs = np.argsort(importances)[::-1]
importances, cols = importances[idxs], cols[idxs]

# Print results as a 1-based ranking
for rank, (col, importance) in enumerate(zip(cols, importances), start=1):
    print(f"{rank:2d}) {col:30s} \t{importance:5.3f}")

In [None]:
# Bare expression: as the last statement of a notebook cell it echoes the
# current value of `cols` as the cell output.
cols

In [None]:
# Reverse the order for plotting (assuming `cols` / `importances` are sorted
# from largest to smallest, the largest bar then ends up at the top).
cols2, importances2 = cols[::-1], importances[::-1]

# Plotting: horizontal bar chart, one bar per feature
fig, ax = plt.subplots(figsize=(5, 8))
ax.barh(cols2, importances2, color='lightblue')
ax.set_title('Importances by features')
plt.show()

### Using the `SelectFromModel` feature-selection object in scikit-learn

In [None]:
from sklearn.feature_selection import SelectFromModel

# Create the selector around the forest, with an importance threshold of 0.1
sfm = SelectFromModel(forest, threshold=0.1)

# Training
sfm.fit(X_train, y_train)

# Boolean mask of the columns that were chosen
cols_bool = sfm.get_support()

# Keep only the names and importance values of the chosen columns;
# the importances are read from the selector's fitted estimator.
feature_names = df.columns[1:]
cols = feature_names[cols_bool]
importances = sfm.estimator_.feature_importances_[cols_bool]

In [None]:
# Sort both arrays by importance, from largest to smallest
idxs = np.argsort(importances)[::-1]
importances, cols = importances[idxs], cols[idxs]

# Print results as a 1-based ranking
for rank, (col, importance) in enumerate(zip(cols, importances), start=1):
    print(f"{rank:2d}) {col:30s} \t{importance:5.3f}")

In [None]:
# Transform X: apply the fitted selector `sfm` to the training features
X_selected = sfm.transform(X_train)

# Show the shape of the transformed array
print(X_selected.shape)