## Feature Engineering

In [181]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

from prepare import train_val_test
from acquire import get_auto_mpg

In [182]:
# Load the auto-mpg data through the project's acquire helper and preview the first rows.
df = get_auto_mpg()
df.head()

Unnamed: 0,mpg,cylinders,displ,horsepower,weight,acc,model_year,origin,name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,"""chevrolet chevelle malibu"""
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,"""buick skylark 320"""
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,"""plymouth satellite"""
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,"""amc rebel sst"""
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,"""ford torino"""


In [183]:
# Split into train / validate / test frames with the project's prepare helper,
# then show the (rows, columns) shape of each split.
train, val, test = train_val_test(df)
train.shape, val.shape, test.shape

((235, 9), (78, 9), (79, 9))

In [184]:
# Min-max scale the four continuous predictors.
# The scaler is fit on train only and the scaled values overwrite the original
# columns of `train` in place; val and test are not transformed in this cell.
mms = MinMaxScaler()

train[['displ','horsepower','weight','acc']] = mms.fit_transform(train[['displ','horsepower','weight','acc']])

# Preview the training rows after scaling.
train.head()

Unnamed: 0,mpg,cylinders,displ,horsepower,weight,acc,model_year,origin,name
212,16.5,8,0.727273,0.725275,0.784519,0.244048,76,1,"""cadillac seville"""
346,32.3,4,0.07013,0.104396,0.128154,0.583333,81,3,"""subaru"""
325,44.3,4,0.051948,0.0,0.133825,0.815476,80,2,"""vw rabbit c (diesel)"""
90,12.0,8,0.932468,0.824176,0.946697,0.208333,73,1,"""mercury marquis brougham"""
246,32.8,4,0.020779,0.021978,0.105472,0.678571,78,3,"""mazda glc deluxe"""


## SelectKBest

Uses statistical tests to compare each feature with the target variable to determine which features have the strongest relationship with the target.

SelectKBest(stats test, k=num features to return)

Fit to data

get_support() to generate mask for column names.

In [185]:
# Predictors for SelectKBest: the four scaled continuous columns. Target: mpg.
X_train_scaled = train[['displ','horsepower','weight','acc']]
y_train = train['mpg']

In [186]:
# Univariate selector: score every feature with f_regression and keep the best 2.
f_selector = SelectKBest(f_regression, k=2)

In [187]:
# Compute the per-feature scores against the target.
f_selector.fit(X_train_scaled, y_train)

In [188]:
# Boolean mask over the input columns: True where the feature was selected.
f_select_mask = f_selector.get_support()

In [189]:
# Use the mask to look up the names of the selected columns.
X_train_scaled.columns[f_select_mask]

Index(['displ', 'weight'], dtype='object')

In [190]:
# Apply the same mask positionally to keep only the selected predictor columns.
X_train_scaled.iloc[:,f_select_mask]

Unnamed: 0,displ,weight
212,0.727273,0.784519
346,0.070130,0.128154
325,0.051948,0.133825
90,0.932468,0.946697
246,0.020779,0.105472
...,...,...
72,0.607792,0.646158
107,0.420779,0.333428
272,0.210390,0.352141
352,0.072727,0.217465


## RFE

Recursive feature elimination

Initialize a model

RFE(model, n_features_to_select=num features to return)

Fit to data

.support_ to get mask

.ranking_ to get ranking of features

In [191]:
# Re-check the training frame (continuous columns were scaled earlier) before building the RFE inputs.
train.head()

Unnamed: 0,mpg,cylinders,displ,horsepower,weight,acc,model_year,origin,name
212,16.5,8,0.727273,0.725275,0.784519,0.244048,76,1,"""cadillac seville"""
346,32.3,4,0.07013,0.104396,0.128154,0.583333,81,3,"""subaru"""
325,44.3,4,0.051948,0.0,0.133825,0.815476,80,2,"""vw rabbit c (diesel)"""
90,12.0,8,0.932468,0.824176,0.946697,0.208333,73,1,"""mercury marquis brougham"""
246,32.8,4,0.020779,0.021978,0.105472,0.678571,78,3,"""mazda glc deluxe"""


In [192]:
# Predictors for RFE: everything except the target (mpg) and the model_year and name columns.
X_train = train.drop(columns=['mpg','model_year','name'])
X_train.head()

Unnamed: 0,cylinders,displ,horsepower,weight,acc,origin
212,8,0.727273,0.725275,0.784519,0.244048,1
346,4,0.07013,0.104396,0.128154,0.583333,3
325,4,0.051948,0.0,0.133825,0.815476,2
90,8,0.932468,0.824176,0.946697,0.208333,1
246,4,0.020779,0.021978,0.105472,0.678571,3


In [193]:
# One-hot encode the two categorical columns so the linear model can use them.
# NOTE: this rebinds X_train, so the cell is meant to run once after the cell that builds X_train.
X_train = pd.get_dummies(X_train, columns=['cylinders','origin'])
X_train.head()

Unnamed: 0,displ,horsepower,weight,acc,cylinders_3,cylinders_4,cylinders_5,cylinders_6,cylinders_8,origin_1,origin_2,origin_3
212,0.727273,0.725275,0.784519,0.244048,0,0,0,0,1,1,0,0
346,0.07013,0.104396,0.128154,0.583333,0,1,0,0,0,0,0,1
325,0.051948,0.0,0.133825,0.815476,0,1,0,0,0,0,1,0
90,0.932468,0.824176,0.946697,0.208333,0,0,0,0,1,1,0,0
246,0.020779,0.021978,0.105472,0.678571,0,1,0,0,0,0,0,1


In [194]:
# Number of candidate features after encoding.
len(X_train.columns)

12

In [195]:
# Estimator that RFE refits at each elimination step to decide which feature to drop.
lm = LinearRegression()

# Keep eliminating features until 7 remain.
# NOTE: the name `rfe` is bound to a function of the same name later in this notebook.
rfe = RFE(lm, n_features_to_select=7)

In [196]:
# Run the elimination on the encoded predictors.
rfe.fit(X_train, y_train)

In [197]:
# ranking_ is 1 for every selected feature; larger numbers mark eliminated features.
ranks = rfe.ranking_
# Column names in the same order as the ranking array.
columns = X_train.columns.tolist()

In [198]:
# Pair each feature name with its RFE rank.
feature_ranks = pd.DataFrame({'ranking':ranks,
                              'feature':columns})

In [199]:
# Best-ranked (selected) features first.
feature_ranks.sort_values('ranking')

Unnamed: 0,ranking,feature
0,1,displ
1,1,horsepower
2,1,weight
4,1,cylinders_3
5,1,cylinders_4
6,1,cylinders_5
11,1,origin_3
7,2,cylinders_6
10,3,origin_2
9,4,origin_1


## Exercises

### 1. Load the tips dataset
 - Create a column named price_per_person. This should be the total bill divided by the party size.
 - Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?
 - Use select k best to select the top 2 features for predicting tip amount. What are they?
 - Use recursive feature elimination to select the top 2 features for tip amount. What are they?
 - Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?


In [200]:
from pydataset import data

In [227]:
# Load the tips dataset from pydataset (rebinds `df`) and preview it.
df = data('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [228]:
# Engineered feature: total bill divided by party size.
df['price_per_person'] = df['total_bill']/df['size']
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


**My guess at the most important features (before running any selection method): day, time, total_bill, size**

In [229]:
# Split the tips data into train / validate / test (rebinds the earlier auto-mpg splits).
train, val, test = train_val_test(df)
train.shape, val.shape, test.shape

((146, 8), (49, 8), (49, 8))

In [230]:
# Check dtypes and non-null counts before scaling.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146 entries, 195 to 103
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        146 non-null    float64
 1   tip               146 non-null    float64
 2   sex               146 non-null    object 
 3   smoker            146 non-null    object 
 4   day               146 non-null    object 
 5   time              146 non-null    object 
 6   size              146 non-null    int64  
 7   price_per_person  146 non-null    float64
dtypes: float64(3), int64(1), object(4)
memory usage: 10.3+ KB


In [231]:
# Min-max scale the three numeric predictors of the tips data.
# The scaler is fit on train only and the scaled values overwrite the original
# columns of `train` in place; val and test are not transformed in this cell.
mms = MinMaxScaler()

train[['total_bill','size','price_per_person']] = mms.fit_transform(train[['total_bill','size','price_per_person']])

# Preview the training rows after scaling.
train.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
195,0.240346,4.0,Male,Yes,Thur,Lunch,0.2,0.311207
77,0.270084,3.08,Male,Yes,Sat,Dinner,0.2,0.349713
42,0.259876,2.54,Male,No,Sun,Dinner,0.2,0.336494
109,0.277186,3.76,Male,No,Sat,Dinner,0.2,0.358908
224,0.227031,3.0,Female,No,Fri,Lunch,0.4,0.1409


In [232]:
# Re-check dtypes after scaling (the scaled `size` column is now float instead of int).
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146 entries, 195 to 103
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        146 non-null    float64
 1   tip               146 non-null    float64
 2   sex               146 non-null    object 
 3   smoker            146 non-null    object 
 4   day               146 non-null    object 
 5   time              146 non-null    object 
 6   size              146 non-null    float64
 7   price_per_person  146 non-null    float64
dtypes: float64(4), object(4)
memory usage: 10.3+ KB


In [205]:
# Every column except the target is a predictor; tip is the target.
X_train_scaled = train.drop(columns=['tip'])
y_train = train['tip']

In [206]:
# One-hot encode the four categorical columns so both selectors receive numeric input.
X_train_scaled = pd.get_dummies(X_train_scaled, columns=['sex','smoker','day','time'])
X_train_scaled.head()

Unnamed: 0,total_bill,size,price_per_person,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
195,0.240346,0.2,0.311207,0,1,0,1,0,0,0,1,0,1
77,0.270084,0.2,0.349713,0,1,0,1,0,1,0,0,1,0
42,0.259876,0.2,0.336494,0,1,1,0,0,0,1,0,1,0
109,0.277186,0.2,0.358908,0,1,1,0,0,1,0,0,1,0
224,0.227031,0.4,0.1409,1,0,1,0,1,0,0,0,0,1


In [207]:
# Univariate selector for the tips data: keep the 2 features with the best f_regression score.
f_selector = SelectKBest(f_regression, k=2)

In [208]:
# Score every encoded predictor against tip.
f_selector.fit(X_train_scaled, y_train)

In [209]:
# Boolean mask over the encoded columns: True where the feature was selected.
f_select_mask = f_selector.get_support()

In [210]:
# Show the first rows of the two selected columns.
X_train_scaled.iloc[:,f_select_mask].head()

Unnamed: 0,total_bill,size
195,0.240346,0.2
77,0.270084,0.2
42,0.259876,0.2
109,0.277186,0.2
224,0.227031,0.4


In [211]:
# Estimator that RFE refits at each elimination step.
lm = LinearRegression()

# Keep eliminating features until 2 remain.
# NOTE: the name `rfe` is bound to a function of the same name later in this notebook.
rfe = RFE(lm, n_features_to_select=2)

In [212]:
# Run the elimination on the encoded tips predictors.
rfe.fit(X_train_scaled, y_train)

In [213]:
# ranking_ is 1 for every selected feature; larger numbers mark eliminated features.
ranks = rfe.ranking_
# Column names in the same order as the ranking array.
columns = X_train_scaled.columns.tolist()

In [214]:
# Pair each feature name with its RFE rank.
feature_ranks = pd.DataFrame({'ranking':ranks,
                              'feature':columns})

In [215]:
# Best-ranked (selected) features first.
feature_ranks.sort_values('ranking')

Unnamed: 0,ranking,feature
0,1,total_bill
2,1,price_per_person
1,2,size
12,3,time_Lunch
4,4,sex_Male
5,5,smoker_No
11,6,time_Dinner
3,7,sex_Female
9,8,day_Sun
6,9,smoker_Yes


### 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.


In [216]:
def select_kbest(df, cont, cat, y, k):
    
    '''
    Return the k best predictor columns for the target, as chosen by SelectKBest
    with the f_regression scoring function.

    The input data frame is copied first, so the caller's frame is left untouched.
    On the copy, the continuous columns are min-max scaled, the target is split
    off, and the categorical columns are one-hot encoded before the selector is fit.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding both the predictors and the target column.
    cont : list of str
        Continuous columns to scale with MinMaxScaler. May be empty, in which
        case no scaling is done.
    cat : list of str
        Categorical columns to one-hot encode with pd.get_dummies. May be empty.
    y : str
        Name of the target column.
    k : int
        Number of features to keep.

    Returns
    -------
    pandas.DataFrame
        The scaled and encoded predictors restricted to the k selected columns
        (use .columns on the result to get the feature names).
    '''
    
    # work on a copy so the caller's data frame is not rescaled as a side effect
    # (also avoids SettingWithCopyWarning when df is a slice of another frame)
    df = df.copy()
    
    # fitting and scaling the continuous variables; skipped when there are none,
    # because MinMaxScaler cannot be fit on a frame with zero columns
    if len(cont) > 0:
        mms = MinMaxScaler()
        df[cont] = mms.fit_transform(df[cont])
    
    # creating X_train and y_train data frames
    X_df_scaled = df.drop(columns=[y])
    y_df = df[y]
    
    # creating dummies for the categorical variables
    X_df_scaled = pd.get_dummies(X_df_scaled, columns=cat)
    
    # fitting the f_regression selector to the data
    f_selector = SelectKBest(f_regression, k=k)
    f_selector.fit(X_df_scaled, y_df)
    
    # boolean mask marking which columns are the top k variables
    f_select_mask = f_selector.get_support()
    
    # returning a data frame of only the top k variables
    return X_df_scaled.iloc[:,f_select_mask]

In [217]:
# Column groups for the tips data: continuous columns to scale and
# categorical columns to one-hot encode.
cont = ['total_bill','size','price_per_person']
cat = ['sex','smoker','day','time']

# Expected to match the manual SelectKBest result from earlier in the notebook.
select_kbest(train, cont, cat, 'tip', 2).head()

Unnamed: 0,total_bill,size
195,0.240346,0.2
77,0.270084,0.2
42,0.259876,0.2
109,0.277186,0.2
224,0.227031,0.4


### 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.


In [218]:
def rfe(df, cont, cat, y, k):
    
    '''
    Rank every predictor for the target using recursive feature elimination (RFE)
    with a linear regression estimator.

    The input data frame is copied first, so the caller's frame is left untouched.
    On the copy, the continuous columns are min-max scaled, the target is split
    off, and the categorical columns are one-hot encoded before RFE is fit.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding both the predictors and the target column.
    cont : list of str
        Continuous columns to scale with MinMaxScaler. May be empty, in which
        case no scaling is done.
    cat : list of str
        Categorical columns to one-hot encode with pd.get_dummies. May be empty.
    y : str
        Name of the target column.
    k : int
        Number of features RFE should select (these receive ranking 1).

    Returns
    -------
    pandas.DataFrame
        One row per predictor with the columns 'ranking' and 'feature', sorted
        by ranking in ascending order so the selected features come first.
    '''
    
    # work on a copy so the caller's data frame is not rescaled as a side effect
    # (also avoids SettingWithCopyWarning when df is a slice of another frame)
    df = df.copy()
    
    # fitting and scaling the continuous variables; skipped when there are none,
    # because MinMaxScaler cannot be fit on a frame with zero columns
    if len(cont) > 0:
        mms = MinMaxScaler()
        df[cont] = mms.fit_transform(df[cont])
    
    # creating X_train and y_train data frames
    X_df_scaled = df.drop(columns=[y])
    y_df = df[y]
    
    # creating dummies for the categorical variables
    X_df_scaled = pd.get_dummies(X_df_scaled, columns=cat)
        
    # creating linear regression RFE model that keeps k features
    lm = LinearRegression()
    model = RFE(lm, n_features_to_select=k)
    
    # fitting model to scaled data
    model.fit(X_df_scaled, y_df)
    
    # ranking of each feature, in the same order as the predictor columns
    ranks = model.ranking_
    columns = X_df_scaled.columns.tolist()
    
    # creating data frame of ranking and column names
    feature_ranks = pd.DataFrame({'ranking':ranks,
                                  'feature':columns})
    
    # returns created data frame of feature rankings, best-ranked first
    return feature_ranks.sort_values('ranking')

In [219]:
# Same inputs as the select_kbest call, ranked by RFE with 2 features selected.
rfe(train, cont, cat, 'tip', 2)

Unnamed: 0,ranking,feature
0,1,total_bill
2,1,price_per_person
1,2,size
12,3,time_Lunch
4,4,sex_Male
6,5,smoker_Yes
3,6,sex_Female
5,7,smoker_No
7,8,day_Fri
11,9,time_Dinner


### 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [220]:
# Load the swiss dataset from pydataset (rebinds `df`) and preview it.
df = data('swiss')
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [221]:
# Split the swiss data into train / validate / test (rebinds the earlier tips splits).
train, val, test = train_val_test(df)
train.shape, val.shape, test.shape

((28, 6), (9, 6), (10, 6))

In [222]:
# All five swiss predictors are passed as continuous columns to scale;
# no columns are one-hot encoded.
cont = ['Agriculture', 'Examination','Education','Catholic','Infant.Mortality']
cat = []

# Top 3 features for Fertility according to SelectKBest.
select_kbest(train, cont, cat, 'Fertility', 3).head()

Unnamed: 0,Examination,Education,Catholic
Sarine,0.40625,0.366667,0.911906
Grandson,0.4375,0.2,0.011753
Yverdon,0.375,0.2,0.040368
Herens,0.0625,0.0,1.0
Rive Droite,0.40625,0.9,0.493408


In [223]:
# Same inputs, ranked by RFE with 3 features selected (ranking 1 marks the selected ones).
rfe(train, cont, cat, 'Fertility', 3)

Unnamed: 0,ranking,feature
0,1,Agriculture
2,1,Education
4,1,Infant.Mortality
3,2,Catholic
1,3,Examination
