# Feature Engineering Exercises

In [23]:
import numpy as np
import pandas as pd
import sklearn.preprocessing
from pydataset import data

from sklearn.feature_selection import RFE, SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

## 1. Load the tips dataset.

In [2]:
# Fetch the restaurant tips sample data through pydataset.
df = data('tips')
# Peek at the first five rows to confirm the load worked.
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


### a. Create a column named tip_percentage. This should be the tip amount divided by the total bill.

In [3]:
# Tip expressed as a fraction of the bill (0.15 means a 15% tip), row by row.
df['tip_percentage'] = df['tip'].div(df['total_bill'])
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


### b. Create a column named price_per_person. This should be the total bill divided by the party size.

In [4]:
# Average spend per diner: bill divided by party size, rounded to two decimals.
df['price_per_person'] = df['total_bill'].div(df['size']).round(2)
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.49
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,3.45
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,7.0
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,6.15


### c. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

- Tip amount: I expect total_bill, size, and price_per_person to be the most important features.
- Tip percentage: time might be an important feature. Do people usually tip more at dinner than at lunch?

### d. Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [5]:
# Keep only the numeric columns; for the tips data this removes the text
# columns sex, smoker, day and time.
# Selecting by dtype and rebinding `df` (rather than dropping named columns
# in place) means this cell can be re-run without raising a KeyError for
# columns that are already gone.
df = df.select_dtypes(include='number')

In [6]:
# Two-stage split: hold out 20% of the rows for test, then divide the
# remaining rows 70/30 into train and validate. The fixed random_state
# keeps the partitions reproducible.
train_validate, test = train_test_split(df, random_state=123, test_size=0.2)
train, validate = train_test_split(train_validate, random_state=123, test_size=0.3)

# (rows, columns) of each partition
tuple(part.shape for part in (train, validate, test))

((136, 5), (59, 5), (49, 5))

In [7]:
# Separate the predictors (X) from the target (y = tip) in every partition.
# NOTE(review): tip_percentage was computed as tip / total_bill, so the
# predictors still contain a column derived from the target -- confirm that
# this is intended before trusting the selected features.
X_train, y_train = train.drop(columns='tip'), train['tip']
X_validate, y_validate = validate.drop(columns='tip'), validate['tip']
X_test, y_test = test.drop(columns='tip'), test['tip']

In [8]:
# Sanity check: the predictors and the target must have the same row count.
(X_train.shape, y_train.shape)

((136, 4), (136,))

In [10]:
# Learn each column's min/max from the training predictors only, then apply
# that same scaling to all three partitions.
scaler = sklearn.preprocessing.MinMaxScaler(copy=True)
scaler.fit(X_train)

x_train_scaled, x_validate_scaled, x_test_scaled = (
    scaler.transform(part) for part in (X_train, X_validate, X_test)
)


In [11]:
# Score every predictor against the target with the f_regression test and
# keep the two highest-scoring ones.
f_selector = SelectKBest(score_func=f_regression, k=2).fit(x_train_scaled, y_train)

# Boolean array: True where the corresponding column was selected.
feature_mask = f_selector.get_support()

In [15]:
# Translate the boolean mask back into the names of the selected columns.
f_feature = X_train.columns[feature_mask].tolist()

In [18]:
# Report the two predictors SelectKBest kept for the tip-amount target.
print(f'The top 2 best features for predicting the tip amount are: {f_feature}')

The top 2 best features for predicting the tip amount are: ['total_bill', 'size']


In [24]:
# Linear regression is the estimator that RFE fits while eliminating features.
lm = LinearRegression()

# Ask RFE to end up with 2 features. n_features_to_select must be passed by
# keyword: scikit-learn 1.0+ raises a TypeError when it is given positionally.
rfe = RFE(lm, n_features_to_select=2)

# Fit RFE on the scaled training predictors and the target.
rfe.fit(x_train_scaled, y_train)

# Boolean array: True where the corresponding column survived elimination.
feature_mask = rfe.support_

# Translate the mask back into the names of the surviving columns.
rfe_feature = X_train.columns[feature_mask].tolist()

In [25]:
# Report the two predictors RFE kept for the tip-amount target.
print(f'The top 2 best features for predicting the tip amount are: {rfe_feature}')

The top 2 best features for predicting the tip amount are: ['total_bill', 'tip_percentage']


### e. Use all the other numeric features to predict tip percentage. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [26]:
# Rebuild the same train/validate/test partitions (same sizes and seed),
# this time with tip_percentage as the target.
train_validate, test = train_test_split(df, random_state=123, test_size=0.2)
train, validate = train_test_split(train_validate, random_state=123, test_size=0.3)

# Separate the predictors (X) from the target (y = tip_percentage).
# NOTE(review): tip_percentage was computed as tip / total_bill, and both of
# those columns remain among the predictors -- confirm that this is intended.
X_train, y_train = train.drop(columns='tip_percentage'), train['tip_percentage']
X_validate, y_validate = validate.drop(columns='tip_percentage'), validate['tip_percentage']
X_test, y_test = test.drop(columns='tip_percentage'), test['tip_percentage']

# Learn each column's min/max from the training predictors only, then apply
# that same scaling to all three partitions.
scaler = sklearn.preprocessing.MinMaxScaler(copy=True)
scaler.fit(X_train)

x_train_scaled, x_validate_scaled, x_test_scaled = (
    scaler.transform(part) for part in (X_train, X_validate, X_test)
)

In [28]:
# Score every predictor against tip_percentage with the f_regression test and
# keep the two highest-scoring ones.
f_selector = SelectKBest(score_func=f_regression, k=2).fit(x_train_scaled, y_train)

# Boolean array: True where the corresponding column was selected.
feature_mask = f_selector.get_support()

# Translate the mask back into the names of the selected columns.
f_feature = X_train.columns[feature_mask].tolist()

print(f'The top 2 best features for predicting the tip percentage are: {f_feature}')

The top 2 best features for predicting the tip percentage are: ['tip', 'price_per_person']


In [29]:
# Linear regression is the estimator that RFE fits while eliminating features.
lm = LinearRegression()

# Ask RFE to end up with 2 features. n_features_to_select must be passed by
# keyword: scikit-learn 1.0+ raises a TypeError when it is given positionally.
rfe = RFE(lm, n_features_to_select=2)

# Fit RFE on the scaled training predictors and the tip_percentage target.
rfe.fit(x_train_scaled, y_train)

# Boolean array: True where the corresponding column survived elimination.
feature_mask = rfe.support_

# Translate the mask back into the names of the surviving columns.
rfe_feature = X_train.columns[feature_mask].tolist()

print(f'The top 2 best features for predicting the tip percentage are: {rfe_feature}')

The top 2 best features for predicting the tip percentage are: ['total_bill', 'tip']


### f. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

## 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [30]:
def select_kbest(X, y, k):
    """Return the names of the k predictors that SelectKBest ranks highest.

    Each column of X is scored against y with the univariate f_regression
    test, the same configuration used in the manual steps of this notebook.
    Scaling X beforehand is not required: the F-statistic is derived from
    each column's correlation with y, which min-max scaling leaves unchanged.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns; the returned names are taken from X.columns.
    y : array-like
        Target values, one per row of X.
    k : int
        Number of features to keep.

    Returns
    -------
    list
        Names of the k selected columns, in the order they appear in X.
    """
    selector = SelectKBest(score_func=f_regression, k=k).fit(X, y)
    return X.columns[selector.get_support()].tolist()

## 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

## 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).