In [1]:
from pydataset import data
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_regression, RFE

import pandas as pd
from wrangle import split_data

import numpy as np
from math import sqrt
import matplotlib.pyplot as plt

import warnings
# Install a blanket 'ignore' filter so no warnings are shown in later cells.
# NOTE(review): this also hides library deprecation warnings (e.g. about changed
# call signatures) — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
def select_kbest(X, y, k):
    """Select the top ``k`` features of ``X`` with scikit-learn's SelectKBest.

    The predictors are min-max scaled and each one is scored against the
    target with the ``f_regression`` test. The names of the selected features
    are printed and also returned.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns; the column labels are used as the feature names.
    y : array-like
        Target values aligned with the rows of ``X``.
    k : int
        Number of features to keep.

    Returns
    -------
    list
        Labels of the selected columns, in the order they appear in ``X``.
    """
    # Scale, then keep the k features with the best f_regression score.
    f_selector = make_pipeline(MinMaxScaler(), SelectKBest(f_regression, k=k))
    f_selector.fit(X, y)

    # Step 1 of the pipeline is the SelectKBest instance; get_support() gives a
    # boolean mask over X's columns (True where the column was selected).
    feature_mask = f_selector[1].get_support()

    # Index the frame that was passed in rather than a notebook-level global,
    # so the function works for any caller-supplied X.
    f_feature = X.columns[feature_mask].tolist()

    print(str(len(f_feature)), 'features')
    print(f_feature)

    return f_feature

    
def rfe(X, y, k):
    """Select ``k`` features of ``X`` by recursive feature elimination (RFE).

    A LinearRegression estimator is wrapped in scikit-learn's RFE and fitted
    on ``X`` and ``y``. The names of the retained features are printed and
    also returned.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns; the column labels are used as the feature names.
    y : array-like
        Target values aligned with the rows of ``X``.
    k : int
        Number of features to keep.

    Returns
    -------
    list
        Labels of the retained columns, in the order they appear in ``X``.
    """
    # n_features_to_select must be given by keyword: scikit-learn made it
    # keyword-only, so passing k positionally fails on current releases.
    selector = RFE(LinearRegression(), n_features_to_select=k)

    # Only the fitted support mask is needed, so fit() is enough here.
    selector.fit(X, y)

    # support_ is a boolean mask over X's columns (True where the feature is kept).
    rfe_features = X.columns[selector.support_].tolist()

    print(str(len(rfe_features)), 'selected features')
    print(rfe_features)

    return rfe_features

In [3]:
# Load the tips data and engineer two ratio features.
# Columns are read with brackets throughout: df.size is the DataFrame attribute
# (rows * columns), so the party-size column has to be accessed as df['size'].
df = data("tips")
df['tip_percentage'] = df['tip'] / df['total_bill']
df['price_per_person'] = df['total_bill'] / df['size']

# Working hypothesis: total_bill should be the strongest predictor of tip.
# NOTE(review): tip_percentage is computed from tip, so it carries target
# information into the predictors while tip is the target.
y = df.pop('tip')

# One-hot encode the categorical columns, dropping the first level of each.
df = pd.get_dummies(df, drop_first=True)

X_train, X_test, X_val, y_train, y_test, y_val = split_data(df, y)

df.info()

X_train, X_test, X_val, y_train, y_test, y_val
(170, 10) (37, 10) (37, 10) (170,) (37,) (37,)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        244 non-null    float64
 1   size              244 non-null    int64  
 2   tip_percentage    244 non-null    float64
 3   price_per_person  244 non-null    float64
 4   sex_Male          244 non-null    uint8  
 5   smoker_Yes        244 non-null    uint8  
 6   day_Sat           244 non-null    uint8  
 7   day_Sun           244 non-null    uint8  
 8   day_Thur          244 non-null    uint8  
 9   time_Lunch        244 non-null    uint8  
dtypes: float64(3), int64(1), uint8(6)
memory usage: 11.0 KB


In [4]:
# Target = tip: top 8 features by SelectKBest, then top 2 by RFE.
select_kbest(X=X_train, y=y_train, k=8)
print("\n")
rfe(X=X_train, y=y_train, k=2)

8 features
['total_bill', 'size', 'tip_percentage', 'price_per_person', 'sex_Male', 'day_Sun', 'day_Thur', 'time_Lunch']


2 selected features
['size', 'tip_percentage']


In [6]:
# Repeat the selection with tip_percentage as the target: put the previous
# target (tip) back among the predictors and pop tip_percentage out instead.
# RFE's top 2 are now size and tip — tip takes the place tip_percentage held
# before, which is not surprising because tip_percentage is derived from tip.
df = pd.concat((df, y), axis="columns")
y = df.pop('tip_percentage')

X_train, X_test, X_val, y_train, y_test, y_val = split_data(df, y)

print("\n")
select_kbest(X=X_train, y=y_train, k=8)
print("\n")
rfe(X=X_train, y=y_train, k=2)

X_train, X_test, X_val, y_train, y_test, y_val
(170, 10) (37, 10) (37, 10) (170,) (37,) (37,)


8 features
['total_bill', 'size', 'price_per_person', 'smoker_Yes', 'day_Sat', 'day_Sun', 'time_Lunch', 'tip']


2 selected features
['size', 'tip']


In [22]:
# Load the swiss dataset; Fertility is the target, the remaining columns are predictors.
df2 = data("swiss")
y = df2.pop('Fertility')

# Preview the first two predictor rows.
df2.head(n=2)

Unnamed: 0,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,17.0,15,12,9.96,22.2
Delemont,45.1,6,9,84.84,22.2


In [20]:
# Number of features both selectors should keep for the swiss data.
k = 3
X_train, X_test, X_val, y_train, y_test, y_val = split_data(df2, y)

print("\n")
select_kbest(X=X_train, y=y_train, k=k)
print("\nRecursive feature elimination:")
rfe(X=X_train, y=y_train, k=k)

X_train, X_test, X_val, y_train, y_test, y_val
(32, 5) (8, 5) (7, 5) (32,) (8,) (7,)


3 features
['Examination', 'Education', 'Catholic']

Recursive feature elimination:
3 selected features
['Examination', 'Education', 'Infant.Mortality']
