# Wrapper methods

# RFE

In [7]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC 
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score

In [8]:
data = load_breast_cancer()
X=pd.DataFrame(data.data , columns= data.feature_names)
y= pd.Series(data.target)

In [9]:
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [10]:
svm_model = SVC(kernel='linear')

In [11]:
rfe = RFE(estimator=svm_model , n_features_to_select=10)
rfe.fit(X_train , y_train)

In [13]:
selected_features=X_train.columns[rfe.support_]
print('selected features by rfe:' , selected_features)

selected features by rfe: Index(['mean radius', 'mean concavity', 'mean concave points', 'radius error',
       'texture error', 'worst smoothness', 'worst compactness',
       'worst concavity', 'worst concave points', 'worst symmetry'],
      dtype='object')


In [14]:
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)
svm_model.fit(X_train_rfe , y_train)

In [15]:
y_pred = svm_model.predict(X_test_rfe)
accuracy = accuracy_score(y_test , y_pred)
print(f"Model accuracy with selected features: {accuracy:.2f}" )

Model accuracy with selected features: 0.93


# Forward method

In [16]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [17]:
data = load_iris()
X=data.data
y=data.target
feature_names = data.feature_names

In [18]:
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [21]:
selected = []
best_score = 0
for _ in range(X.shape[1]):
    best_feature = None
    for i in range(X.shape[1]):
        if i in selected:
            continue

    trial = selected + [i]
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train[:,trial] , y_train)
    pred = model.predict(X_test[:,trial])
    score = accuracy_score(y_test , pred)

    if score > best_score:
        best_score=score
        best_feature=i

    if best_feature is not None:
        selected.append(best_feature)
    else:
        break

print("selected_features:")
for i in selected:
    print(f"- {feature_names[i]}")

selected_features:
- petal width (cm)


# Embedded methods

# Lasso

In [22]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [24]:
data = fetch_california_housing()
X = pd.DataFrame(data.data , columns= data.feature_names)
y = pd.Series(data.target)

In [25]:
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [26]:
lasso = Lasso(alpha=0.1)

In [27]:
lasso.fit(X_train , y_train)

In [28]:
selected_features = X_train.columns[(lasso.coef_!=0)]
print("selected features by lasso:" , selected_features)

selected features by lasso: Index(['MedInc', 'HouseAge', 'Population', 'AveOccup', 'Latitude',
       'Longitude'],
      dtype='object')


In [29]:
y_pred = lasso.predict(X_test)

In [30]:
mse = mean_squared_error(y_test , y_pred)
print(f"Mean squared error with lasso selected features: {mse:.2f}" )

Mean squared error with lasso selected features: 0.60


# Filter methods

Pearson correlation

In [33]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr

np.random.seed(42)
data = {
    'feature1' : np.random.rand(10),
    'feature2' : np.random.rand(10),
    'target' :np.random.rand(10)
}
df = pd.DataFrame(data)

corr1,_ = pearsonr(df['feature1'] , df['target'])
corr2,_ = pearsonr(df['feature2'] , df['target'])

print(f"pearson correlation between feature1 and target: {corr1:.2f}")
print(f"pearson correlation between feature2 and target: {corr2:.2f}")

pearson correlation between feature1 and target: -0.37
pearson correlation between feature2 and target: -0.45


In [35]:
from sklearn.feature_selection import chi2
from sklearn.preprocessing import LabelEncoder
import numpy as np

data = np.array([
    ['low' , 'yes'] , 
    ['medium' , 'no'] , 
    ['medium' , 'yes'] , 
    ['high' , 'no'] , 
    ['low' , 'yes'] ,
    ['low' , 'no']
])

label_encoder = LabelEncoder()
X= label_encoder.fit_transform(data[:,0])
y= label_encoder.fit_transform(data[:,1])
chi_scores , p_values = chi2(X.reshape(-1,1) , y)
print(f'chi2 statistic: {chi_scores[0]:.2f} , p-value:{p_values[0]:.2f}')

chi2 statistic: 0.14 , p-value:0.71


ANOVA (analysis of variance)

In [37]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import f_classif
import pandas as pd

iris = load_iris()
X=iris.data
y=iris.target

F_values , p_values = f_classif(X,y)

for i , feature in enumerate(iris.feature_names):
    print(f"{feature}:f_value = {F_values[i] : .2f}")


sepal length (cm):f_value =  119.26
sepal width (cm):f_value =  49.16
petal length (cm):f_value =  1180.16
petal width (cm):f_value =  960.01


## PCA

In [38]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [39]:
data = load_iris()
X= pd.DataFrame(data.data , columns= data.feature_names)
y=pd.DataFrame(data.target)

In [40]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [41]:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)