In [1]:
import pandas as pd
from pydataset import data
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

In [2]:
df = data('tips')

In [3]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


#### Create a column named price_per_person. This should be the total bill divided by the party size.

In [4]:
df['price_per_person'] = df['total_bill'] / df['size']
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


#### Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

- total_bill, day, size, price per person

#### Use select k best to select the top 2 features for predicting tip amount. What are they?

In [5]:
X_train = df[['total_bill', 'day', 'time', 'size', 'price_per_person']] 
y_train = df['tip']


In [6]:
X_train = pd.get_dummies(X_train, columns=['day', 'time', 'size'])
X_train.head()

Unnamed: 0,total_bill,price_per_person,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch,size_1,size_2,size_3,size_4,size_5,size_6
1,16.99,8.495,0,0,1,0,1,0,0,1,0,0,0,0
2,10.34,3.446667,0,0,1,0,1,0,0,0,1,0,0,0
3,21.01,7.003333,0,0,1,0,1,0,0,0,1,0,0,0
4,23.68,11.84,0,0,1,0,1,0,0,1,0,0,0,0
5,24.59,6.1475,0,0,1,0,1,0,0,0,0,1,0,0


In [7]:
mms = MinMaxScaler()

X_train[['total_bill', 'price_per_person']] = mms.fit_transform(X_train[['total_bill', 'price_per_person']])

X_train.head()

Unnamed: 0,total_bill,price_per_person,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch,size_1,size_2,size_3,size_4,size_5,size_6
1,0.291579,0.322989,0,0,1,0,1,0,0,1,0,0,0,0
2,0.152283,0.032854,0,0,1,0,1,0,0,0,1,0,0,0
3,0.375786,0.237261,0,0,1,0,1,0,0,0,1,0,0,0
4,0.431713,0.51523,0,0,1,0,1,0,0,1,0,0,0,0
5,0.450775,0.188075,0,0,1,0,1,0,0,0,0,1,0,0


In [8]:
f_selector = SelectKBest(f_regression, k=2)

In [9]:
f_selector.fit(X_train, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x7fed784a63a0>)

In [10]:
f_select_mask = f_selector.get_support()

In [11]:
X_train.columns[f_select_mask]

Index(['total_bill', 'size_2'], dtype='object')

In [12]:
f_selector.transform(X_train)

array([[0.29157939, 1.        ],
       [0.1522832 , 0.        ],
       [0.3757855 , 0.        ],
       [0.43171345, 1.        ],
       [0.45077503, 0.        ],
       [0.46543779, 0.        ],
       [0.11939673, 1.        ],
       [0.49874319, 0.        ],
       [0.25073314, 1.        ],
       [0.24528697, 1.        ],
       [0.15081693, 1.        ],
       [0.67427734, 0.        ],
       [0.25869292, 1.        ],
       [0.32174277, 0.        ],
       [0.24633431, 1.        ],
       [0.38772518, 1.        ],
       [0.15207373, 0.        ],
       [0.27691663, 0.        ],
       [0.29116045, 0.        ],
       [0.36824466, 0.        ],
       [0.31105991, 1.        ],
       [0.36070381, 1.        ],
       [0.2660243 , 1.        ],
       [0.761416  , 0.        ],
       [0.35085882, 1.        ],
       [0.30875576, 0.        ],
       [0.21575199, 1.        ],
       [0.20150817, 1.        ],
       [0.39023879, 1.        ],
       [0.34729786, 1.        ],
       [0.

In [13]:
X_train.iloc[:,f_select_mask]

Unnamed: 0,total_bill,size_2
1,0.291579,1
2,0.152283,0
3,0.375786,0
4,0.431713,1
5,0.450775,0
...,...,...
240,0.543779,0
241,0.505027,1
242,0.410557,1
243,0.308965,1


#### Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [14]:
lm = LinearRegression()

rfe = RFE(lm, n_features_to_select=7)

In [15]:
rfe.fit(X_train, y_train)

RFE(estimator=LinearRegression(), n_features_to_select=7)

In [16]:
ranks = rfe.ranking_
columns = X_train.columns.tolist()

In [17]:
feature_ranks = pd.DataFrame({'ranking': ranks,
              'feature': columns})

In [18]:
feature_ranks.sort_values('ranking')

Unnamed: 0,ranking,feature
0,1,total_bill
1,1,price_per_person
2,1,day_Fri
4,1,day_Sun
9,1,size_2
12,1,size_5
13,1,size_6
8,2,size_1
3,3,day_Sat
5,4,day_Thur


#### Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?


#### Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [50]:
def select_kbest(X,y,k=2):
    '''this function takes in the predictors (X) and the target (Y) and 
       the number of features to select(k) and returns the names of the 
       top k selected features based on SelectKBest class'''
    # set variable
    kbest = SelectKBest(f_regression, k=k)
    # fit it
    kbest.fit(X, y)
    # mask
    f_select_mask = kbest.get_support()
    
    return X.columns[f_select_mask]

In [51]:
select_kbest(X_train, y_train, 2)

Index(['total_bill', 'size_2'], dtype='object')

#### Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [56]:
def rfe(X,y,k=2):
    '''this function takes in the predictors, the target, and the number of 
       features to select based on the top k features on the RFE class'''
    # set the variable
    rf = RFE(LinearRegression(), n_features_to_select=k)
    #fit it
    rf.fit(X, y)
    # mask
    f_select_mask = rf.get_support()
    
    return X.columns[f_select_mask]

In [49]:
rfe(X_train, y_train, 2)

Index(['total_bill', 'price_per_person'], dtype='object')

#### Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [37]:
swiss = data('swiss')

In [38]:
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [53]:
X_train1 = swiss[['Agriculture', 'Examination', 'Education', 'Catholic', 'Infant.Mortality']] 
y_train1 = swiss['Fertility']

In [54]:
select_kbest(X_train1, y_train1, 2) 

Index(['Examination', 'Education'], dtype='object')

In [55]:
rfe(X_train1, y_train1, 2)

Index(['Education', 'Infant.Mortality'], dtype='object')