# Feature Engineering Exercsies

In [1]:
import pandas as pd
import numpy as np
from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
import sklearn.preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE

import wrangle

import warnings
warnings.filterwarnings("ignore")

## 1. Load the tips dataset.

In [2]:
df = data('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


### a. Create a column named tip_percentage. This should be the tip amount divided by the total bill.

In [3]:
df['tip_percentage'] = df.tip/df.total_bill
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


### b. Create a column named price_per_person. This should be the total bill divided by the party size.

In [4]:
df['price_per_person'] = round(df.total_bill/df['size'], 2)
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.49
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,3.45
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,7.0
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,6.15


### c. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

- Total_bill, size, and price_per_person ==> predicted most important features for tip amount
- Time might be an important feature for tip %...do people for dinner usually tip more than lunch?

### d. Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [5]:
#drop non-numeric columns
df.drop(columns={'sex', 'smoker', 'day', 'time'}, inplace=True)

In [6]:
#split the data into train, validate, and test
train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=123)
train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=123)

train.shape, validate.shape, test.shape

((136, 5), (59, 5), (49, 5))

In [7]:
#further split the data into x_train and y-train
X_train = train.drop(columns=['tip'])
y_train = train.tip

X_validate = validate.drop(columns=['tip'])
y_validate = validate.tip

X_test = test.drop(columns=['tip'])
y_test = test.tip

In [8]:
X_train.shape, y_train.shape

((136, 4), (136,))

In [9]:
#scale the data
scaler = sklearn.preprocessing.MinMaxScaler(copy=True).fit(X_train)

x_train_scaled = scaler.transform(X_train)

x_validate_scaled = scaler.transform(X_validate)

x_test_scaled = scaler.transform(X_test)


In [10]:
# parameters: f_regression stats test, give me 2 features
f_selector = SelectKBest(f_regression, k=2)

# find the top 2 X's correlated with y
f_selector.fit(x_train_scaled, y_train)

# boolean mask of whether the column was selected or not. 
feature_mask = f_selector.get_support()

In [11]:
# get list of top K features. 
f_feature = X_train.iloc[:,feature_mask].columns.tolist()

In [12]:
print(f'The top 2 best features for predicting the tip amount are: {f_feature}')

The top 2 best features for predicting the tip amount are: ['total_bill', 'size']


In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

# initialize the ML algorithm
lm = LinearRegression()

# create the rfe object, indicating the ML object (lm) and the number of features I want to end up with. 
rfe = RFE(lm, 2)

# fit the data using RFE
rfe.fit(x_train_scaled,y_train)  

# get the mask of the columns selected
feature_mask = rfe.support_

# get list of the column names. 
rfe_feature = X_train.iloc[:,feature_mask].columns.tolist()

In [14]:
print(f'The top 2 best features for predicting the tip amount are: {rfe_feature}')

The top 2 best features for predicting the tip amount are: ['total_bill', 'tip_percentage']


### e. Use all the other numeric features to predict tip percentage. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [15]:
#split the data into train, validate, and test
train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=123)
train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=123)


#further split the data into x_train and y-train
X_train = train.drop(columns=['tip_percentage'])
y_train = train.tip_percentage

X_validate = validate.drop(columns=['tip_percentage'])
y_validate = validate.tip_percentage

X_test = test.drop(columns=['tip_percentage'])
y_test = test.tip_percentage

#scale the data
scaler = sklearn.preprocessing.MinMaxScaler(copy=True).fit(X_train)

x_train_scaled = scaler.transform(X_train)

x_validate_scaled = scaler.transform(X_validate)

x_test_scaled = scaler.transform(X_test)

In [16]:
# parameters: f_regression stats test, give me 2 features
f_selector = SelectKBest(f_regression, k=2)

# find the top 2 X's correlated with y
f_selector.fit(x_train_scaled, y_train)

# boolean mask of whether the column was selected or not. 
feature_mask = f_selector.get_support()

# get list of top K features. 
f_feature = X_train.iloc[:,feature_mask].columns.tolist()

print(f'The top 2 best features for predicting the tip percentage are: {f_feature}')

The top 2 best features for predicting the tip percentage are: ['tip', 'price_per_person']


In [17]:
# initialize the ML algorithm
lm = LinearRegression()

# create the rfe object, indicating the ML object (lm) and the number of features I want to end up with. 
rfe = RFE(lm, 2)

# fit the data using RFE
rfe.fit(x_train_scaled,y_train)  

# get the mask of the columns selected
feature_mask = rfe.support_

# get list of the column names. 
rfe_feature = X_train.iloc[:,feature_mask].columns.tolist()

print(f'The top 2 best features for predicting the tip percentage are: {rfe_feature}')

The top 2 best features for predicting the tip percentage are: ['total_bill', 'tip']


### f. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?

- RFE builds a model and removes the weakest feature first, then continues to whittle the features down to k features. k best compares each feature against the target with statistical testing to find the pairs with the strongest relationships.

- RFE is selecting features 'in relationship' (by nature of modeling) to each other so total bill and tip are features for predicting the tip percentage.

- kbest is selecting a pair (because k was set to 2) that have the strongest relationship, individually, to the target. 

- Price per person probably correlates (should run a stats test!) to total_bill...and I hypothesize that if I enlarged k to 3, price per person whould show up for RFE and and total_bill would be returned as the 3rd feature for kbest

## 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [18]:
def select_kbest(x_train_scaled, y_train, k):

    f_selector = SelectKBest(f_regression, k=k)

    # find the top 2 X's correlated with y
    f_selector.fit(x_train_scaled, y_train)

    # boolean mask of whether the column was selected or not. 
    feature_mask = f_selector.get_support()

     # convert arrays to dataframes
    x_train_scaled = pd.DataFrame(x_train_scaled, columns=X_train.columns).\
                                  set_index([X_train.index.values])

    # get list of top K features. 
    f_feature = x_train_scaled.iloc[:,feature_mask].columns.tolist()

    print(f'The top 2 best features for predicting the tip percentage are: {f_feature}')
    

In [19]:
select_kbest(x_train_scaled, y_train, 2)

The top 2 best features for predicting the tip percentage are: ['tip', 'price_per_person']


## 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [20]:
def ref(x_train_scaled, y_train, k):
    # initialize the ML algorithm
    lm = LinearRegression()

    # create the rfe object, indicating the ML object (lm) and the number of features I want to end up with. 
    rfe = RFE(lm, k)

    # fit the data using RFE
    rfe.fit(x_train_scaled,y_train)  

    # get the mask of the columns selected
    feature_mask = rfe.support_

    # get list of the column names. 
    rfe_feature = X_train.iloc[:,feature_mask].columns.tolist()

    print(f'The best features for predicting the target are: {rfe_feature}')

In [21]:
ref(x_train_scaled, y_train, 2)

The best features for predicting the target are: ['total_bill', 'tip']


## 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [22]:
#Load the data and preview the heading
df = data('swiss')
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [23]:
#Check for nulls, shape, and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [24]:
#Use train, test, split module to split the data
X_train, y_train, X_validate, y_validate, X_test, y_test = wrangle.train_validate_test_split(df, target='Fertility')

In [25]:
#Check the module did what it was supposed to do...
X_train.head()

Unnamed: 0,Agriculture,Examination,Education,Catholic,Infant.Mortality
Rolle,60.8,16,10,7.72,16.3
Lavaux,73.0,19,9,2.84,20.0
Nyone,50.9,22,12,15.14,16.7
Conthey,85.9,3,2,99.71,15.1
Yverdon,49.5,15,8,6.1,22.5


In [26]:
#Scale the data
X_train_scaled, X_validate_scaled, X_test_scaled = wrangle.minmax_scale_data(X_train, X_validate, X_test)

In [27]:
#Use kbest to determine thetop two best features for predicting fertility
wrangle.select_kbest(X_train_scaled, y_train, 2)

The top best features for predicting the target are: ['Examination', 'Catholic']


In [28]:
#Use RFE to determine the top two best features for predicting fertility
wrangle.rfe(X_train_scaled, y_train, 2)

The best features for predicting the target are: ['Examination', 'Infant.Mortality']
